
rm(list=ls())
## load tsv data table

require(data.table)
data<- data.frame(fread("Bordbar/meta_23_original.tsv"))

## 1.  remove metabolites with unknown chebi ID
data_chebi = data[sapply(data[,1], function(x) grepl("CHEBI", x) ),]

## 2.  remove columns with NAs
data_nonna = data_chebi[,!apply(is.na(data_chebi), 2, any)]

## 3. remove metabolites with >=50% zeros
data_metab = apply(data_nonna[,8:19], c(1,2), as.numeric)
metab_delete = rowMeans(data_metab==0) >= 0.5
data_23 = data_nonna[!metab_delete,]


## 4. save the clean data
library(xlsx)
write.xlsx(data_23,"Bordbar/meta_23.xlsx")

## save chebi IDs for generating pathways
chebi_23 = data_23$database_identifier
write.table(chebi_23, file = "Bordbar/chebi_23.txt",row.names = F,col.names = F, quote = FALSE)

## 5. save the data matrix
data_matrix = t(data_metab[!metab_delete,])
rownames(data_matrix)
colnames(data_matrix) = data_23$metabolite_identification

## 6. normalized the data
hist(data_matrix)
if(min(data_matrix)>0){
  data_matrix_normal = log2(data_matrix)
  hist(data_matrix_normal)
} else{
  a = 1 ## default value, could be other values 
  data_matrix_normal = log2((data_matrix + sqrt(data_matrix^2 + a^2))/2)
  hist(data_matrix_normal)
}

y = as.numeric(grepl("Control",rownames(data_matrix_normal)))
d23 =  cbind(data_matrix_normal, y) 
identical(colnames(data_matrix_normal), data_23$metabolite_identification)

save(d23, file="Bordbar/d23.RData")





############################   ========================= smpdb_HMDB_pathway =========================   ############################ 

smpdbhmdb <- data.frame(fread("Bordbar/mbrole2_smpdbhmdb.tsv"))
smpdbhmdbpath_chebi = smpdbhmdb$Submitted.IDs
names(smpdbhmdbpath_chebi) = smpdbhmdb$ID.Annotation
listsmpdbhmdbpath = lapply(smpdbhmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listsmpdbhmdbpath_uniq = listsmpdbhmdbpath[which(!duplicated(listsmpdbhmdbpath))]

## map cheiID to metabolites identifies
smpdbhmdbpath <- lapply(listsmpdbhmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_23$metabolite_identification[which(chebi_23 %in% x )] )

smpdbhmdbpath_metab = smpdbhmdbpath[which(!duplicated(smpdbhmdbpath))]

save(smpdbhmdbpath_metab, file="Bordbar/smpdbhmdbpath_metab_23.RData")




############################   ========================= Biocycpathway =========================   ############################ 

biocyc <- data.frame(fread("Bordbar/mbrole2_biocyc.tsv"))
biocycpath_chebi = biocyc$Submitted.IDs
names(biocycpath_chebi) = biocyc$ID.Annotation
listbiocycpath = lapply(biocycpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listbiocycpath_uniq = listbiocycpath[which(!duplicated(listbiocycpath))]

## map cheiID to metabolites identifies
biocycpath <- lapply(listbiocycpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_23$metabolite_identification[which(chebi_23 %in% x )] )

biocycpath_metab = biocycpath[which(!duplicated(biocycpath))]

save(biocycpath_metab, file="Bordbar/biocycpath_metab_23.RData")



############################   ========================= keggpathway =========================   ############################ 

kegg<- data.frame(fread("Bordbar/mbrole2_kegg.tsv"))
keggpath_chebi = kegg$Submitted.IDs
names(keggpath_chebi) = kegg$ID.Annotation
listkeggpath = lapply(keggpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )

## remove duplicate the pathways
listkeggpath_uniq = listkeggpath[which(!duplicated(listkeggpath))]

## map cheiID to metabolites identifies
keggpath <- lapply(listkeggpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_23$metabolite_identification[which(chebi_23 %in% x )] )

keggpath_metab = keggpath[which(!duplicated(keggpath))]

save(keggpath_metab, file="Bordbar/keggpath_metab_23.RData")



############################   ========================= wikipathway =========================   ############################ 
library(rWikiPathways)
listMetab <- NULL
Pathways<-listPathwayIds(organism="Mus musculus")
wikipath_info<-data.frame(id=1:length(Pathways))
wikipath_info$names<-Pathways
#list of chebi IDs per pathway
listMetab<-apply(as.matrix(Pathways),1, function(x) getXrefList(pathway = x, systemCode="Ce"))

#remove chebiID character
listMetab<-lapply(listMetab, function(x) gsub('^\\CHEBI:|\\$', '', x))
listMetab<-lapply(listMetab, function(x) unique(x))
wikipath_info$chebi_path = listMetab
wikipath_info$path_length<-sapply(listMetab, function(x) length(x))


#map chebi IDs to  your own metabolite names
wikiMetab<-lapply(listMetab, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
  data_23$metabolite_identification[which(sapply(chebi_23, function(x) gsub('^\\CHEBI:|\\$', '', x)) %in% x)])  

# remove empty pathways
wikipath = wikiMetab[sapply(wikiMetab,length)>0]
wikipath_uniq = wikipath[which(!duplicated(wikipath))]
save(wikipath_uniq, file="Bordbar/wikipath_metab_23.RData")



# ############################   ========================= smpdb_ECMDB_pathway =========================   ############################ 
# 
# smpdbecmdb <- data.frame(fread("Bordbar/mbrole2_smpdbecmdb.tsv"))
# smpdbecmdbpath_chebi = smpdbecmdb$Submitted.IDs
# names(smpdbecmdbpath_chebi) = smpdbecmdb$ID.Annotation
# listsmpdbecmdbpath = lapply(smpdbecmdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listsmpdbecmdbpath_uniq = listsmpdbecmdbpath[which(!duplicated(listsmpdbecmdbpath))]
# 
# ## map cheiID to metabolites identifies
# smpdbecmdbpath <- lapply(listsmpdbecmdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_23$metabolite_identification[which(chebi_23 %in% x )] )
# 
# smpdbecmdbpath_metab = smpdbecmdbpath[which(!duplicated(smpdbecmdbpath))]
# 
# save(smpdbecmdbpath_metab, file="Bordbar/smpdbecmdbpath_metab_23.RData")
# 
# 
# 
# 
# ############################   ========================= smpdb_YMDB_pathway =========================   ############################ 
# 
# smpdbymdb <- data.frame(fread("Bordbar/mbrole2_smpdbymdb.tsv"))
# smpdbymdbpath_chebi = smpdbymdb$Submitted.IDs
# names(smpdbymdbpath_chebi) = smpdbymdb$ID.Annotation
# listsmpdbymdbpath = lapply(smpdbymdbpath_chebi, function(x) unlist(strsplit(x, "|", fixed = TRUE)) )
# 
# ## remove duplicate the pathways
# listsmpdbymdbpath_uniq = listsmpdbymdbpath[which(!duplicated(listsmpdbymdbpath))]
# 
# ## map cheiID to metabolites identifies
# smpdbymdbpath <- lapply(listsmpdbymdbpath_uniq, function(x) ## for each wikipathway, to retrieve which metabolites of my data are belong to this pathway
#   data_23$metabolite_identification[which(chebi_23 %in% x )] )
# 
# smpdbymdbpath_metab = smpdbymdbpath[which(!duplicated(smpdbymdbpath))]
# 
# save(smpdbymdbpath_metab, file="Bordbar/smpdbymdbpath_metab_23.RData")






